This file contains the scripts used for a contrastive analysis of English and Norwegian future constructions.

Preliminaries

Load packages

library(tidyverse)
library(readxl)
library(party)
library(lattice)
library(Hmisc)
library(pdp)
library(collostructions) # available at sfla.ch
library(patchwork)
library(ggparty)
library(Boruta)
library(svglite)

Queries

The English data were queried via CWB. SpokenBNC2014 was transformed to CWB format using this script. OANC was transformed to CWB using this script. Uncollapse the following code box to see the queries.

# Spoken BNC 2014
cqp -e
SPOKENBNC2014
[word="will|shall|going|wo|'?ll|gon"][word="to|n't|na"]?
cat Last > "will_shall_going_to_SPOKENBNC2014.txt"

# in addition, for taking priming effects into account:
cqp -e
SPOKENBNC2014;
set context 150 words;
show +u_who;
set ld "üüü";
set rd "üüü";
[word="will|shall|going|wo|'?ll|gon"][word="to|n't|na"]?;
cat Last > will_shall_going_to_SPOKENBNC2014_more_context.txt

In the text file, replace üüü by \t.

# OANC
cqp -e
OANCSPOKEN
set context s
show +lemma +pos
set PrintStructures "text_id, text_genre, text_file, turn_id, turn_age, turn_sex"
A = [word="will|shall|going|wo|ll|gon|gonna"][word="to|n't|na"]?
cat A > "oanc_spoken.txt"

# additional query for 'll files because the apostrophe
# was missing in the original query
cqp -e
OANCSPOKEN
set context s
show +lemma +pos
set PrintStructures "text_id, text_genre, text_file, turn_id, turn_age, turn_sex"
A = [word="'ll"]
cat A > "oanc_ll.txt"

Data wrangling

Read data

# read data: Norwegian
nor_bb <- read_xlsx("data/Norwegian/nota_bb_vil_skal_komer_til_a.xlsx", sheet = "bigbrother")
nor_nota <- read_xlsx("data/Norwegian/nota_bb_vil_skal_komer_til_a.xlsx", sheet = "nota")
nor <- rbind(nor_nota, nor_bb)

# clean column names
colnames(nor) <- gsub(" ", "_", colnames(nor))

# read data: English
oanc <- read_csv("data/English/OANC/oanc_spoken_going_to_shall_will_5000.csv")
bnc  <- read_xlsx("data/English/SpokenBNC2014/SPOKENBNC2014_spoken_will_shall_going_to_sample_5000-final-priming.xlsx")

Add annotation columns

Protasis/apodosis annotation columns are added automatically.

# add protasis/apodosis annotation to BNC
bnc$if_clause2 <- ifelse(bnc$if_clause=="if" & bnc$subordinate=="sub", "protasis", NA)
bnc$if_clause2 <- ifelse(bnc$if_clause=="if" & bnc$subordinate=="main", "apodosis", bnc$if_clause2)
bnc$if_clause2 <- ifelse(bnc$if_clause!="if", "no", bnc$if_clause2)

# add protasis/apodosis annotation to OANC
oanc$if_clause2 <- ifelse(oanc$if_clause=="if" & oanc$subordinate=="sub", "protasis", NA)
oanc$if_clause2 <- ifelse(oanc$if_clause=="if" & oanc$subordinate=="main", "apodosis", oanc$if_clause2)
oanc$if_clause2 <- ifelse(oanc$if_clause!="if", "no", oanc$if_clause2)

# add protasis/apodosis annotation to NOR
nor$if_clause2 <- ifelse(nor$`If-clause`=="if" & nor$Clause_type=="sub", "protasis", "NA")
nor$if_clause2 <- ifelse(nor$`If-clause`=="if" & nor$Clause_type=="main", "apodosis", nor$if_clause2)
nor$if_clause2 <- ifelse(nor$`If-clause`!="if", "no", nor$if_clause2)

Also, we add columns with the outcome variable to the English data, with an additional column with a binary encoding (will / going to)

# OANC ---------

# binary:
oanc$cxn <- ifelse(oanc$Key %in% c("gonna", "going"), "going_to", "will")

# more fine-grained:
oanc$cxn1 <- case_when(oanc$Key == "will" ~ "will",
                       oanc$Key == "'ll" ~ "'ll",
          oanc$Key == "shall" ~ "shall",
          oanc$Key == "wo" ~ "won't",
          oanc$Key == "going" ~ "going to",
          oanc$Key == "gonna" ~ "gonna")
oanc$cxn1 <- factor(oanc$cxn1, levels = c("will", "'ll", "shall", "won't", "going to", "gonna"))

# BNC ---------

# binary
bnc$cxn <- ifelse(bnc$Key %in% c("'ll", "will", "wo"), "will", "going to")

bnc$cxn1 <- case_when(bnc$Key == "gon" ~ "gonna",
          bnc$Key == "'ll" ~ "'ll",
          bnc$Key == "going" ~ "going to",
          bnc$Key == "will" ~ "will",
          bnc$Key == "wo" ~ "won't",
          bnc$Key == "shall" ~ "shall")

bnc$cxn1 <- factor(bnc$cxn1, levels = c("will", "'ll", "shall", "won't", "going to", "gonna"))

Omit false hits:

bnc  <- bnc %>% filter(keep == "y")
oanc <- oanc %>% filter(keep=="y")

Summary statistics

Number of hits per corpus:

# Norwegian

left_join(setNames(as.data.frame(table(nor_nota$Cx)), c("Cxn", "NoTa")),
          setNames(as.data.frame(table(nor_bb$Cx)), c("Cxn", "BB")))
# English
left_join(setNames(as.data.frame(table(bnc$cxn1)), c("Cxn", "BNC")), 
          setNames(as.data.frame(table(oanc$cxn1)), c("Cxn", "OANC")))

CART Tree & Random forest - Norwegian

Fitting a conditional inference tree and a random forest to the Norwegian data:

# Norwegian: ---------------

# relevant variables as factors
nor$Cx <- factor(nor$Cx)
nor$Negative <- factor(nor$Negative)
nor$Interrogative <- factor(nor$Interrogative)
nor$if_clause2 <- factor(nor$if_clause2)
nor$Clause_type <- factor(nor$Clause_type)

# reduce number of levels for Lexeme
nor$Lexeme <- factor(nor$Lexeme)
nor$lexeme <- fct_lump_min(nor$Lexeme, min = 50, other_level = "other")


# more descriptive name
nor$Construction <- nor$Cx

# shorter cx labels (obsolete now that we're using ggparty)
nor$cxn <- case_when(nor$Cx == "kommer" ~ "k",
          nor$Cx == "skal" ~ "s",
          nor$Cx == "vil" ~ "v")
nor$cxn <- factor(nor$cxn)

# shorter if clause label
nor$if_clause <- nor$if_clause2

# CART tree
set.seed(19851003)
tr_nor <- ctree(Construction ~ Negative+Interrogative+if_clause+Clause_type,
                data = nor)

# plot - adapted from https://ladal.edu.au/tree.html

# extract p-values
pvals <- unlist(nodeapply(tr_nor, ids = nodeids(tr_nor), function(n) info_node(n)$p.value))
pvals <- pvals[pvals <.05]

( tr_nor_plot <- ggparty(tr_nor) +
  geom_edge() +
  geom_edge_label() +
  geom_node_label(line_list = list(aes(label = splitvar),
                                   aes(label = paste0("N=", nodesize, ", p", 
                                                      ifelse(pvals < .001, "<.001", paste0("=", round(pvals, 3)))), 
                                       size = 10)),
                  line_gpar = list(list(size = 13), 
                                   list(size = 10)), 
                  ids = "inner") +
  geom_node_label(aes(label = paste0("Node ", id, ", N = ", nodesize)),
    ids = "terminal", nudge_y = -0.0, nudge_x = 0.01) +
  geom_node_plot(gglist = list(
    geom_bar(aes(x = "", fill = Construction),
             position = position_fill(), color = "black"),
    theme_minimal(),
    scale_fill_grey(start = .4, end = .9),
    scale_y_continuous(breaks = c(0, 1)),
    xlab(""), 
    ylab("Probability"),
    geom_text(aes(x = "", group = Construction,
                  label = after_stat(count)),
              stat = "count", position = position_fill(), vjust = 1.1)),
    shared_axis_labels = TRUE) )

# export as pdf (ggsave not working for ggparty objects apparently)

# png("figures/tree_NO.png", width = 10, height = 6, un = "in", res = 300)
# plot(tr_nor)
# dev.off()

For the publication, the plot has been slightly revised using Inkscape to prevent the overlap of the egde running from “negative” to the corresponding bar with the “clause_type” node.

Conditional inference tree for Norwegian.
Conditional inference tree for Norwegian.
# random forest
set.seed(19851003)
for_nor <- party::cforest(Cx ~ Negative+Interrogative+if_clause+Clause_type,
                data = nor, 
                controls = cforest_unbiased(mtry = 2, ntree = 2000))
vi <- party::varimp(for_nor, conditional = TRUE)

Plot the variable importance:

(vi_plot_nor <- as.data.frame(vi) %>% rownames_to_column() %>% ggplot(aes(y = fct_reorder(rowname, vi), x = vi)) + 
  geom_point(size = 5) + xlim(min(vi), max(vi)) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor = element_blank()) + xlab("Conditional variable importance") + ylab("Variable") +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)))

Check model fit

Model accuracy

# fitted values for out of bag sample
pred.nor <- predict(for_nor, OOB = T)
# proportion of correct predictions
sum(as.numeric(sapply(1:length(pred.nor), function(i) pred.nor[i] == nor$Cx[i]))) / length(pred.nor)
## [1] 0.793072

CART Tree and Random Forest - British and American English

# combine OANC and BNC2014 data

colnames(oanc)[which(colnames(oanc)=="gramm_person")] <- "Gramm_person"
colnames(oanc)[which(colnames(oanc)=="animacy")] <- "Animacy"
colnames(oanc)[which(colnames(oanc)=="question")] <- "Interrogative"
colnames(bnc)[which(colnames(bnc)=="question")] <- "Interrogative"
colnames(bnc)[which(colnames(bnc)=="subordinate")] <- "Clause_type"
colnames(oanc)[which(colnames(oanc)=="subordinate")] <- "Clause_type"
colnames(oanc)[which(colnames(oanc)=="negation")] <- "Negative"
colnames(bnc)[which(colnames(bnc)=="negation")] <- "Negative"


eng <- rbind(
  mutate(select(oanc, Left, Key, Right, Clause_type, Gramm_person, Animacy, Interrogative, Negative, if_clause, if_clause2, cxn, cxn1, Lemma), corpus = "OANC-Spoken"),
mutate(select(bnc, Left, Key, Right,  Clause_type, Gramm_person, Animacy, Interrogative, Negative, if_clause, if_clause2, cxn, cxn1, Lemma), corpus = "SpokenBNC2014")
)

# CART tree
eng$Negative <- factor(eng$Negative)
eng$Interrogative <- factor(eng$Interrogative)
eng$if_clause2 <- factor(eng$if_clause2)
eng$Clause_type <- factor(eng$Clause_type)
eng$cxn <- factor(eng$cxn)
eng$cxn1 <- factor(eng$cxn1)


# omit NAs
eng1 <- select(eng, Negative, Interrogative, if_clause2, Clause_type, cxn1, corpus, Lemma)
eng1 <- na.omit(eng1)

# abbreviate labels - obsolete using ggparty
eng1$cxn <- case_when(eng1$cxn1=="will" ~ "w",
          eng1$cxn1=="shall" ~ "s",
          eng1$cxn1=="won't" ~ "wnt",
          eng1$cxn1=="going to" ~ "gt",
          eng1$cxn1=="gonna" ~ "gna",
          eng1$cxn1=="'ll" ~ "ll")

# add lexeme
eng1$Lexeme <- factor(eng1$Lemma)

# add lexeme with reduced number of levels (bin infrequent lemmas to "other")
eng1$lexeme <- fct_lump_min(eng1$Lexeme, min = 35)


# factor
eng1$Construction <- eng1$cxn1
eng1$cxn <- factor(eng1$cxn, levels = c("w", "ll", "wnt", "s", "gt", "gna"))

# rename if-clause column to increase readability of tree diagram
eng1$if_clause <- eng1$if_clause2

# corpus as factor
eng1$corpus <- factor(eng1$corpus)

# more descriptive: use language variety instead of corpus
eng1$Variety <- ifelse(eng1$corpus == "OANC-Spoken", "AmE", "BrE")
eng1$Variety <- factor(eng1$Variety)

set.seed(1985)
tree_en <- ctree(Construction ~ Negative+Interrogative+if_clause+Clause_type+Variety,
                data = eng1)


# plot

# extract p-values
pvals <- unlist(nodeapply(tree_en, ids = nodeids(tree_en), function(n) info_node(n)$p.value))
pvals <- pvals[pvals <.05]

# plot:
( tr_en_plot <- ggparty(tree_en) +
  geom_edge() +
  geom_edge_label() +
  geom_node_label(line_list = list(aes(label = splitvar),
                                   aes(label = paste0("N=", nodesize, ", p", 
                                                      ifelse(pvals < .001, "<.001", paste0("=", round(pvals, 3)))), 
                                       size = 10)),
                  line_gpar = list(list(size = 13), 
                                   list(size = 10)), 
                  ids = "inner") +
  geom_node_label(aes(label = paste0("Node ", id, ", N = ", nodesize)),
    ids = "terminal", nudge_y = -0.0, nudge_x = 0.01) +
  geom_node_plot(gglist = list(
    geom_bar(aes(x = "", fill = Construction),
             position = position_fill(), color = "black"),
    theme_minimal(),
    scale_fill_grey(start = .4, end = .9),
    scale_y_continuous(breaks = c(0, 1)),
    xlab(""), 
    ylab("Probability"),
    geom_text(aes(x = "", group = Construction,
                  label = after_stat(count)),
              stat = "count", position = position_fill(), vjust = 1.1)),
    shared_axis_labels = TRUE) )

# save as pdf via export pane

#dev.off()
# png("tree_en.png", width = 30, height = 10, un = "in", res = 300)
# plot(tree_en)
# dev.off()

# random forest
set.seed(1985)
for_en <- party::cforest(Construction ~ Negative+Interrogative+if_clause+Clause_type+Variety,
                data = eng1, controls = cforest_unbiased(mtry = 3, ntree = 2000))
vi_en <- party::varimp(for_en, conditional = TRUE)

Visualize variable importance:

(vi_plot_en <- as.data.frame(vi_en) %>% rownames_to_column() %>% ggplot(aes(y = fct_reorder(rowname, vi_en), x = vi_en)) + 
  geom_point(size = 5) + xlim(min(vi_en), max(vi_en)) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor = element_blank()) + xlab("Conditional variable importance") + ylab("Variable") +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)))

# both plots
vi_plot_nor + ggtitle("Norwegian") + 
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) + 
  plot_spacer() +
  vi_plot_en + ggtitle("English") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  plot_layout(widths = c(4,.5,4))

# ggsave("figures/forest_NOR_EN.png", width = 12, height = 5)

Check model fit

Model accuracy (number of correct predictions / number of observations)

# fitted values for out of bag sample
pred.eng <- predict(for_en, OOB = T)
table(pred.eng)
## pred.eng
##     will      'll    shall    won't going to    gonna 
##        0     5873        0      473     2067      357
table(eng$cxn1)
## 
##     will      'll    shall    won't going to    gonna 
##     1383     3302      133      443     2017     1492
table(pred.eng, eng1$cxn1)
##           
## pred.eng   will  'll shall won't going to gonna
##   will        0    0     0     0        0     0
##   'll       922 3086    35     0      906   924
##   shall       0    0     0     0        0     0
##   won't      11   18     1   233       42   168
##   going to  401  156    16   210     1013   271
##   gonna      49   42    81     0       56   129
sum(as.numeric(sapply(1:length(pred.eng), function(i) pred.eng[i] == eng1$cxn1[i]))) / length(pred.eng)
## [1] 0.5086659

Boruta analysis

Norwegian

# first Boruta model for Norwegian
nor$Lexeme <- factor(nor$Lexeme)
set.seed(19551105)
boruta01 <- Boruta(Cx ~ Negative+Interrogative+if_clause+Clause_type+Lexeme,
                data = nor)

# decision
getConfirmedFormula(boruta01)
## Cx ~ Lexeme + Clause_type + if_clause + Interrogative + Negative
## <environment: 0x7fe232da3928>
plotImpHistory(boruta01)

plot(boruta01)

# Boruta model with confirmed formula
set.seed(19551105)
boruta02 <- Boruta(Cx ~ Lexeme + Clause_type + if_clause +
                     Interrogative + Negative,
                   data = nor)


(boruta_NOR <- as.data.frame(boruta02$ImpHistory) %>% pivot_longer(cols = 1:length(as.data.frame(boruta02$ImpHistory) )) %>% setNames(c("Variable", "Importance")) %>%
  mutate(Type = ifelse(str_detect(Variable, "shadow"), "Control", "Predictor")) %>%
  mutate(Type = factor(Type),
                Variable = factor(Variable)) %>%
  ggplot(aes(x = reorder(Variable, Importance, mean), y = Importance, fill = Type)) + 
  geom_boxplot() +
  theme_bw() +
  scale_fill_grey(start = .5, end = .9) +
  xlab("Variable") + ylab("Importance") + ggtitle("Norwegian") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  theme(panel.grid.major.y = element_line(linewidth = .2, color = "grey75"),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()))

English

# first Boruta model for English
set.seed(2015)
boruta_en01 <- Boruta(cxn ~ Negative+Interrogative+if_clause+Clause_type+Variety+Lexeme,
                data = eng1)


# decision
getConfirmedFormula(boruta_en01)
## cxn ~ Lexeme + Variety + Clause_type + if_clause + Interrogative + 
##     Negative
## <environment: 0x7fe1ddc4cef0>
plotImpHistory(boruta_en01)

# Boruta model with confirmed formula
boruta_en02 <- Boruta(cxn ~ Lexeme + Variety + Clause_type + if_clause + Interrogative + Negative,
                      data = eng1)

(boruta_EN <- as.data.frame(boruta_en02$ImpHistory) %>% pivot_longer(cols = 1:length(as.data.frame(boruta_en02$ImpHistory))) %>% setNames(c("Variable", "Importance")) %>%
  mutate(Type = ifelse(str_detect(Variable, "shadow"), "Control", "Predictor")) %>%
  mutate(Type = factor(Type),
                Variable = factor(Variable)) %>%
  ggplot(aes(x = reorder(Variable, Importance, mean), y = Importance, fill = Type)) + 
  geom_boxplot() +
  theme_bw() +
  scale_fill_grey(start = .5, end = .9) +
  xlab("Variable") + ylab("Importance") + ggtitle("English") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) +
  theme(panel.grid.major.y = element_line(linewidth = .2, color = "grey75"),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()))

# both plots
boruta_NOR +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  plot_spacer() + boruta_EN +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  plot_layout(guides = "collect", widths = c(4,1,4))

# ggsave("figures/boruta.png", width = 15, height = 6.5)

Collostructional analysis

(Multiple) distinctive collexeme analysis is used to find the lexical items that are most strongly associated with the three variants.

Covarying-collexeme analysis: Norwegian

mdca_nor <- nor %>% select(Cx, Lexeme) %>% as.data.frame %>% collex.covar()

mdca_nor %>% filter(SLOT1=="vil")
mdca_nor %>% filter(SLOT1=="skal")
mdca_nor %>% filter(SLOT1=="kommer")

Covarying-collexeme analysis: British English

mdca_bnc <- bnc %>% select(cxn1, Lemma) %>% as.data.frame() %>% collex.covar()

mdca_bnc %>% filter(SLOT1=="going to")
mdca_bnc %>% filter(SLOT1=="gonna")
mdca_bnc %>% filter(SLOT1=="will")
mdca_bnc %>% filter(SLOT1=="'ll")
mdca_bnc %>% filter(SLOT1=="won't")
mdca_bnc %>% filter(SLOT1=="shall")

Covarying-collexeme analysis: American English

mdca_oanc <- oanc %>% select(cxn1, Lemma) %>% as.data.frame() %>% collex.covar()

mdca_oanc %>% filter(SLOT1=="going to")
mdca_oanc %>% filter(SLOT1=="gonna")
mdca_oanc %>% filter(SLOT1=="will")
mdca_oanc %>% filter(SLOT1=="'ll")
mdca_oanc %>% filter(SLOT1=="won't")
mdca_oanc %>% filter(SLOT1=="shall")

Lemma lists

The following CWB commands give the lemma lists for SpokenBNC2014 and the spoken component of OANC:

cwb-scan-corpus spokenbnc2014 lemma > "spokenbnc2014_lemmas.tsv"
cwb-scan-corpus oancspoken lemma > "oancspoken_lemmas.tsv"

The lemma lists can be used to conduct simple collexeme analyses for the individual constructions.